Goal:
Using the data collected from existing customers, build a model that will help the marketing team identify potential customers who are relatively more likely to subscribe to a term deposit, and thus increase their hit ratio.
Resources Available:
The historical data for this project is available at https://archive.ics.uci.edu/ml/datasets/Bank+Marketing
Context:
Deliverable – 1 (Exploratory data quality report reflecting the following)
Deliverable – 2 (Prepare the data for analytics)
Deliverable – 3 (create the ensemble model)
Attribute Information:
Input variables: Bank client data:
Related to previous contact:
Other attributes:
Output variable (desired target):
# import the necessary libraries
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # matplotlib.pyplot plots data
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
#from sklearn.feature_extraction.text import CountVectorizer #DT does not take strings as input for the model fit step....
from IPython.display import Image
#import pydotplus as pydot
from sklearn import tree
from os import system
import warnings
warnings.filterwarnings("ignore")
from sklearn.tree import export_graphviz
from sklearn.externals.six import StringIO
import pydotplus
import graphviz
class understanding_data:
    """Class-conditional EDA helper.

    Groups a raw dataframe by its binary 'Target' column ('no'/'yes') and
    plots per-feature distributions split by class.
    """

    def __init__(self, raw_df):
        """Store the dataframe and pre-split it into the two target groups.

        raw_df must contain a 'Target' column with both 'no' and 'yes'
        values, otherwise get_group raises KeyError.
        """
        self.raw_df = raw_df
        self.raw_df_grouped = raw_df.groupby("Target")
        self.class_name_no = "no"
        self.class_name_yes = "yes"
        self.raw_df_grouped_no = self.raw_df_grouped.get_group(self.class_name_no)
        self.raw_df_grouped_yes = self.raw_df_grouped.get_group(self.class_name_yes)

    def plot_histogram_continuous(self, feature_name, bin_size):
        """Overlaid histograms of a continuous feature, one per class.

        alpha < 1 so the (majority) 'no' bars drawn first do not completely
        hide the 'yes' bars drawn on top of them.
        """
        plt.figure()
        plt.hist(self.raw_df_grouped_no[feature_name], bins=bin_size, alpha=0.6, label=self.class_name_no)
        plt.hist(self.raw_df_grouped_yes[feature_name], bins=bin_size, alpha=0.6, label=self.class_name_yes)
        plt.legend()
        plt.title("Feature Histogram - "+feature_name)
        plt.xlabel("Feature values")
        plt.ylabel("Count")

    def plot_histogram_categorical(self, feature_name):
        """Grouped bar chart of category counts for the two classes.

        A category present in only one class would leave NaN in the other
        column after the value_counts join; fill with 0 so bars render.
        """
        feature_df = pd.DataFrame()
        feature_df["no"] = self.raw_df_grouped_no[feature_name].value_counts()
        feature_df["yes"] = self.raw_df_grouped_yes[feature_name].value_counts()
        feature_df = feature_df.fillna(0)
        feature_df.plot(kind='bar')
        plt.title("Feature Histogram - "+feature_name)
        plt.ylabel("Count")
        plt.xlabel("Feature unique values")
        plt.tight_layout()
### Read csv and get grouped df based on class
# NOTE(review): assumes a comma-separated file with a 'Target' column.
# The raw UCI bank-full.csv is ';'-separated with target column 'y' — this
# file has presumably been pre-processed; confirm before re-running.
raw_df = pd.read_csv('bank-full.csv')
data_analysis_obj = understanding_data(raw_df)
raw_df.shape   # (rows, columns) — notebook cell echo
raw_df.head()  # peek at the first 5 rows
# univariate analysis
raw_df.info()  # dtypes and non-null counts per column
raw_df.groupby('Target').size()  # class balance of the target
# replace yes=1 & no=0 in Target column
raw_df.replace({'Target':{'no':0,'yes':1}}, inplace=True)
raw_df.head()
#convert categorical variable categories
for feature in raw_df.columns: # Loop through all columns in the dataframe
    if raw_df[feature].dtype == 'object': # Only apply for columns holding strings
        raw_df[feature] = pd.Categorical(raw_df[feature])  # cast to Categorical dtype (integer codes available via .cat.codes)
raw_df.head(10)
raw_df.dtypes
raw_df.isnull().values.any() # check for nulls
print(raw_df['job'].value_counts())
raw_df.skew()     # skewness of the numeric columns
raw_df.nunique()  # cardinality of every column
raw_df.describe().T # describe the data
raw_df.columns
# Univariate analysis: class-conditional distribution of each raw feature.
# Continuous features (with a bin count) get overlaid histograms; categorical
# features get grouped bar charts. Order matches the original per-feature cells.
_univariate_plots = [
    ("age", 50), ("job", None), ("marital", None), ("education", None),
    ("default", None), ("balance", 50), ("housing", None), ("loan", None),
    ("contact", None), ("day", 50), ("month", None), ("campaign", 30),
    ("pdays", 30), ("previous", None), ("poutcome", None),
]
for _feature, _bins in _univariate_plots:
    if _bins is None:
        data_analysis_obj.plot_histogram_categorical(_feature)
    else:
        data_analysis_obj.plot_histogram_continuous(_feature, _bins)
Based upon the histograms, 'pdays' and 'default' add little discriminative information and can be dropped.
# Drop the two low-information columns identified in the univariate analysis.
raw_df.drop(['pdays','default'], axis=1, inplace=True)
# Pairwise bivariate view of all remaining features, colored by class.
sns.pairplot(raw_df, hue='Target', diag_kind='hist')
pd.crosstab(raw_df['Target'],[raw_df['job'],raw_df['housing']], rownames=['Target'], colnames=['job','housing'])
# One-hot encode the categorical columns (drop_first avoids the dummy trap).
# NOTE(review): 'day' is a numeric day-of-month; one-hot encoding it creates
# ~30 dummy columns — confirm that is intended rather than keeping it numeric.
oneHotCols=["job","marital","education","housing","loan","contact","day","month","poutcome"]
raw_df=pd.get_dummies(raw_df, columns=oneHotCols, drop_first=True)
fig, ax = plt.subplots()
fig.set_size_inches(20, 8)
# NOTE(review): positional Series plus hue kwarg; seaborn >= 0.12 requires
# explicit x=/hue= keyword arguments for this call.
sns.countplot(raw_df['age'], hue=raw_df['Target'])
ax.set_xlabel('Age', fontsize=15)
ax.set_ylabel('Count', fontsize=15)
ax.set_title('Age Count Distribution', fontsize=15)
sns.despine()
# sns.countplot(data['age'], hue=data['Target'])
fig, ax = plt.subplots()
fig.set_size_inches(25, 8)
sns.countplot(raw_df['previous'],hue=raw_df['Target'])
ax.set_xlabel('Previous', fontsize=16)
ax.set_ylabel('Number', fontsize=16)
ax.set_title('Previous', fontsize=16)
ax.tick_params(labelsize=16)
sns.despine()
# Prepare the data for model building
X = raw_df.drop("Target" , axis=1)   # feature matrix (drop returns a copy)
# Plain selection instead of pop(): avoids silently mutating raw_df, which is
# still inspected later in the notebook.
y = raw_df["Target"]
print("X Shape: ", X.shape)
print("y Shape: ", y.shape)
# stratify=y keeps the heavily imbalanced class ratio identical in the
# train and test partitions.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.30, random_state=1, stratify=y)
print(X_train.shape)
print(X_test.shape)
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, confusion_matrix
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
# Fit the model on train; liblinear is a solid solver for binary problems.
model = LogisticRegression(solver="liblinear")
model.fit(X_train, y_train)
#predict on test
y_predict = model.predict(X_test)
# Inspect the learned coefficients (one per one-hot column) plus intercept.
coef_df = pd.DataFrame(model.coef_)
coef_df['intercept'] = model.intercept_
print(coef_df)
# Mean accuracy on the held-out 30% test set.
model_score = model.score(X_test, y_test)
print(model_score)
# Confusion matrix
pd.crosstab(y_test, y_predict, rownames=['Actual'], colnames=['Predicted'])
# labels=[1, 0] puts the positive class (subscribers) in the first row/column.
cm=metrics.confusion_matrix(y_test, y_predict, labels=[1, 0])
df_cm = pd.DataFrame(cm, index = [i for i in ["1","0"]],
columns = [i for i in ["Predict 1","Predict 0"]])
plt.figure(figsize = (7,5))
sns.heatmap(df_cm, annot=True,fmt='.2f')
# sns.heatmap(cm, annot=True, fmt='.2f', xticklabels = [0,1] , yticklabels = [0,1] )
# Visualize model performance with the yellowbrick library: a per-class
# precision/recall/F1 report, then a ROC curve with AUC, for the same model.
from yellowbrick.classifier import ClassificationReport, ROCAUC
for visualizer_cls in (ClassificationReport, ROCAUC):
    visualizer = visualizer_cls(model)
    visualizer.fit(X_train, y_train)
    visualizer.score(X_test, y_test)
    visualizer.show()
# Baseline (unpruned) decision tree; fit() returns the estimator itself.
dTree = DecisionTreeClassifier(criterion="gini", random_state=1).fit(X_train, y_train)
print(dTree.score(X_train, y_train))  # training accuracy
print(dTree.score(X_test, y_test))    # test accuracy
Training accuracy is far higher than test accuracy, which indicates the model is overfit.
# Export the unpruned tree to Graphviz ".dot" format, then render it to PNG
# by shelling out to the external "dot" executable.
train_char_label = ['No', 'Yes']
Data_Tree_File = open('data_tree.dot','w')
dot_data = tree.export_graphviz(dTree, out_file=Data_Tree_File, feature_names = list(X_train), class_names = list(train_char_label))
Data_Tree_File.close()
# Works only if the Graphviz "dot" command is available on your machine.
retCode = system("dot -Tpng data_tree.dot -o data_tree.png")
if(retCode>0):
    print("system command returning error: "+str(retCode))
else:
    # display() is an IPython/notebook builtin; shows the rendered tree inline.
    display(Image("data_tree.png"))
#use max_depth=3 to fix overfit
dTreeR = DecisionTreeClassifier(criterion = 'gini', max_depth = 3, random_state=1)
dTreeR.fit(X_train, y_train)
print(dTreeR.score(X_train, y_train))  # training accuracy
print(dTreeR.score(X_test, y_test))    # test accuracy
# (an unused preds_train = dTreeR.predict(X_train) was removed — never read)
preds_test = dTreeR.predict(X_test)
# Kept for the model-comparison table built later.
acc_dTreeR = accuracy_score(y_test, preds_test)
acc_dTreeR
Training accuracy is almost equal to test accuracy, which indicates the model is no longer overfit.
# Render the pruned tree with pydotplus (in-memory; no external "dot" call).
# 'features' is reused below for the importance table, so the name is kept.
features = [column for column in raw_df.columns if column != 'Target']
dot_buffer = StringIO()
export_graphviz(dTreeR, out_file=dot_buffer,
                filled=True, rounded=True, special_characters=True,
                feature_names=features, class_names=['No','Yes'])
graph = pydotplus.graph_from_dot_data(dot_buffer.getvalue())
graph.write_png('dTreeR.png')
Image(graph.create_png())
# Visualize the pruned tree's performance with the yellowbrick library:
# classification report first, then ROC/AUC.
for visualizer_cls in (ClassificationReport, ROCAUC):
    visualizer = visualizer_cls(dTreeR)
    visualizer.fit(X_train, y_train)
    visualizer.score(X_test, y_test)
    visualizer.show()
# Rank features by the pruned tree's (normalized) importances.
# (an unused feat_importance = dTreeR.tree_.compute_feature_importances(...)
# was removed — the line below already uses feature_importances_)
feat_imp_dict = dict(zip(features, dTreeR.feature_importances_))
feat_imp = pd.DataFrame.from_dict(feat_imp_dict, orient='index')
# NOTE: sort_values returns a sorted copy for cell echo; feat_imp is unchanged.
feat_imp.sort_values(by=0, ascending=False)
#Store the accuracy results for each model in a dataframe for final comparison
resultsDf = pd.DataFrame({'Method':['Decision Tree'], 'accuracy': acc_dTreeR})
resultsDf = resultsDf[['Method', 'accuracy']]
resultsDf
from sklearn.ensemble import RandomForestClassifier
# random_state pinned for reproducibility, consistent with the AdaBoost,
# Bagging and GradientBoosting blocks below (all use random_state=22).
rfcl = RandomForestClassifier(n_estimators = 50, random_state=22)
rfcl = rfcl.fit(X_train, y_train)
pred_RF = rfcl.predict(X_test)
acc_RF = accuracy_score(y_test, pred_RF)
# Append this model's accuracy to the comparison table.
tempResultsDf = pd.DataFrame({'Method':['Random Forest'], 'accuracy': [acc_RF]})
resultsDf = pd.concat([resultsDf, tempResultsDf])
resultsDf = resultsDf[['Method', 'accuracy']]
resultsDf
# Visualize random-forest performance with yellowbrick; a fresh estimator is
# constructed (and re-fit) for each visualizer, matching the original cells.
for visualizer_cls in (ClassificationReport, ROCAUC):
    visualizer = visualizer_cls(RandomForestClassifier(n_estimators = 50))
    visualizer.fit(X_train, y_train)
    visualizer.score(X_test, y_test)
    visualizer.show()
from sklearn.ensemble import AdaBoostClassifier
# AdaBoost: 100 boosted learners with a conservative learning rate.
abcl = AdaBoostClassifier(n_estimators = 100, learning_rate=0.1, random_state=22).fit(X_train, y_train)
pred_AB = abcl.predict(X_test)
acc_AB = accuracy_score(y_test, pred_AB)
# Append this model's accuracy to the comparison table.
resultsDf = pd.concat([resultsDf, pd.DataFrame({'Method':['Adaboost'], 'accuracy': [acc_AB]})])
resultsDf = resultsDf[['Method', 'accuracy']]
resultsDf
# Visualize AdaBoost performance with yellowbrick; a fresh estimator is
# constructed (and re-fit) for each visualizer, matching the original cells.
for visualizer_cls in (ClassificationReport, ROCAUC):
    visualizer = visualizer_cls(AdaBoostClassifier(n_estimators= 100, learning_rate=0.1, random_state=22))
    visualizer.fit(X_train, y_train)
    visualizer.score(X_test, y_test)
    visualizer.show()
from sklearn.ensemble import BaggingClassifier
# Bagging: 50 bootstrap estimators on 70% samples; oob_score gives a free
# out-of-bag validation estimate.
bgcl = BaggingClassifier(n_estimators=50, max_samples= .7, bootstrap=True, oob_score=True, random_state=22).fit(X_train, y_train)
pred_BG = bgcl.predict(X_test)
acc_BG = accuracy_score(y_test, pred_BG)
# Append this model's accuracy to the comparison table.
resultsDf = pd.concat([resultsDf, pd.DataFrame({'Method':['Bagging'], 'accuracy': [acc_BG]})])
resultsDf = resultsDf[['Method', 'accuracy']]
resultsDf
# Visualize bagging performance with yellowbrick; a fresh estimator is
# constructed (and re-fit) for each visualizer, matching the original cells.
for visualizer_cls in (ClassificationReport, ROCAUC):
    visualizer = visualizer_cls(BaggingClassifier(n_estimators=50, max_samples= .7, bootstrap=True, oob_score=True, random_state=22))
    visualizer.fit(X_train, y_train)
    visualizer.score(X_test, y_test)
    visualizer.show()
from sklearn.ensemble import GradientBoostingClassifier
# Gradient boosting: 50 stages, shrinkage 0.1.
gbcl = GradientBoostingClassifier(n_estimators = 50, learning_rate = 0.1, random_state=22).fit(X_train, y_train)
pred_GB = gbcl.predict(X_test)
acc_GB = accuracy_score(y_test, pred_GB)
# Append this model's accuracy to the comparison table.
resultsDf = pd.concat([resultsDf, pd.DataFrame({'Method':['Gradient Boost'], 'accuracy': [acc_GB]})])
resultsDf = resultsDf[['Method', 'accuracy']]
resultsDf
# Visualize gradient-boosting performance with yellowbrick; a fresh estimator
# is constructed (and re-fit) for each visualizer, matching the original cells.
for visualizer_cls in (ClassificationReport, ROCAUC):
    visualizer = visualizer_cls(GradientBoostingClassifier(n_estimators = 50, learning_rate = 0.1, random_state=22))
    visualizer.fit(X_train, y_train)
    visualizer.score(X_test, y_test)
    visualizer.show()